home *** CD-ROM | disk | FTP | other *** search
Wrap
# Source Generated with Decompyle++ # File: in.pyc (Python 2.6) '''Beautiful Soup Elixir and Tonic "The Screen-Scraper\'s Friend" http://www.crummy.com/software/BeautifulSoup/ Beautiful Soup parses a (possibly invalid) XML or HTML document into a tree representation. It provides methods and Pythonic idioms that make it easy to navigate, search, and modify the tree. A well-formed XML/HTML document yields a well-formed data structure. An ill-formed XML/HTML document yields a correspondingly ill-formed data structure. If your document is only locally well-formed, you can use this library to find and process the well-formed part of it. Beautiful Soup works with Python 2.2 and up. It has no external dependencies, but you\'ll have more success at converting data to UTF-8 if you also install these three packages: * chardet, for auto-detecting character encodings http://chardet.feedparser.org/ * cjkcodecs and iconv_codec, which add more encodings to the ones supported by stock Python. http://cjkpython.i18n.org/ Beautiful Soup defines classes for two main parsing strategies: * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific language that kind of looks like XML. * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid or invalid. This class has web browser-like heuristics for obtaining a sensible parse tree in the face of common HTML errors. Beautiful Soup also defines a class (UnicodeDammit) for autodetecting the encoding of an HTML or XML document, and converting it to Unicode. Much of this code is taken from Mark Pilgrim\'s Universal Feed Parser. For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/documentation.html Here, have some legalese: Copyright (c) 2004-2009, Leonard Richardson All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the the Beautiful Soup Consortium and All Night Kosher Bakery nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
''' from __future__ import generators __author__ = 'Leonard Richardson (leonardr@segfault.org)' __version__ = '3.1.0.1' __copyright__ = 'Copyright (c) 2004-2009 Leonard Richardson' __license__ = 'New-style BSD' import codecs import markupbase import types import re from HTMLParser import HTMLParser, HTMLParseError try: from htmlentitydefs import name2codepoint except ImportError: name2codepoint = { } try: set except NameError: from sets import Set as set markupbase._declname_match = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*\\s*').match DEFAULT_OUTPUT_ENCODING = 'utf-8' def sob(unicode, encoding): '''Returns either the given Unicode string or its encoding.''' if encoding is None: return unicode return unicode.encode(encoding) class PageElement: '''Contains the navigational information for some part of the page (either a tag or a piece of text)''' def setup(self, parent = None, previous = None): '''Sets up the initial relations between this element and other elements.''' self.parent = parent self.previous = previous self.next = None self.previousSibling = None self.nextSibling = None if self.parent and self.parent.contents: self.previousSibling = self.parent.contents[-1] self.previousSibling.nextSibling = self def replaceWith(self, replaceWith): oldParent = self.parent myIndex = self.parent.contents.index(self) if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: index = self.parent.contents.index(replaceWith) if index and index < myIndex: myIndex = myIndex - 1 self.extract() oldParent.insert(myIndex, replaceWith) def extract(self): '''Destructively rips this element out of the tree.''' if self.parent: try: self.parent.contents.remove(self) except ValueError: pass except: None<EXCEPTION MATCH>ValueError None<EXCEPTION MATCH>ValueError lastChild = self._lastRecursiveChild() nextElement = lastChild.next if self.previous: self.previous.next = nextElement if nextElement: nextElement.previous = self.previous self.previous = None lastChild.next = None 
self.parent = None if self.previousSibling: self.previousSibling.nextSibling = self.nextSibling if self.nextSibling: self.nextSibling.previousSibling = self.previousSibling self.previousSibling = None self.nextSibling = None return self def _lastRecursiveChild(self): '''Finds the last element beneath this object to be parsed.''' lastChild = self while hasattr(lastChild, 'contents') and lastChild.contents: lastChild = lastChild.contents[-1] return lastChild def insert(self, position, newChild): if (isinstance(newChild, basestring) or isinstance(newChild, unicode)) and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) position = min(position, len(self.contents)) if hasattr(newChild, 'parent') and newChild.parent != None: if newChild.parent == self: index = self.find(newChild) if index and index < position: position = position - 1 newChild.extract() newChild.parent = self previousChild = None if position == 0: newChild.previousSibling = None newChild.previous = self else: previousChild = self.contents[position - 1] newChild.previousSibling = previousChild newChild.previousSibling.nextSibling = newChild newChild.previous = previousChild._lastRecursiveChild() if newChild.previous: newChild.previous.next = newChild newChildsLastElement = newChild._lastRecursiveChild() if position >= len(self.contents): newChild.nextSibling = None parent = self parentsNextSibling = None while not parentsNextSibling: parentsNextSibling = parent.nextSibling parent = parent.parent if not parent: break continue if parentsNextSibling: newChildsLastElement.next = parentsNextSibling else: newChildsLastElement.next = None else: nextChild = self.contents[position] newChild.nextSibling = nextChild if newChild.nextSibling: newChild.nextSibling.previousSibling = newChild newChildsLastElement.next = nextChild if newChildsLastElement.next: newChildsLastElement.next.previous = newChildsLastElement self.contents.insert(position, newChild) def append(self, tag): '''Appends 
the given tag to the contents of this tag.''' self.insert(len(self.contents), tag) def findNext(self, name = None, attrs = { }, text = None, **kwargs): '''Returns the first item that matches the given criteria and appears after this Tag in the document.''' return self._findOne(self.findAllNext, name, attrs, text, **kwargs) def findAllNext(self, name = None, attrs = { }, text = None, limit = None, **kwargs): '''Returns all items that match the given criteria and appear after this Tag in the document.''' return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs) def findNextSibling(self, name = None, attrs = { }, text = None, **kwargs): '''Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.''' return self._findOne(self.findNextSiblings, name, attrs, text, **kwargs) def findNextSiblings(self, name = None, attrs = { }, text = None, limit = None, **kwargs): '''Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.''' return self._findAll(name, attrs, text, limit, self.nextSiblingGenerator, **kwargs) fetchNextSiblings = findNextSiblings def findPrevious(self, name = None, attrs = { }, text = None, **kwargs): '''Returns the first item that matches the given criteria and appears before this Tag in the document.''' return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) def findAllPrevious(self, name = None, attrs = { }, text = None, limit = None, **kwargs): '''Returns all items that match the given criteria and appear before this Tag in the document.''' return self._findAll(name, attrs, text, limit, self.previousGenerator, **kwargs) fetchPrevious = findAllPrevious def findPreviousSibling(self, name = None, attrs = { }, text = None, **kwargs): '''Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.''' return self._findOne(self.findPreviousSiblings, name, attrs, 
text, **kwargs) def findPreviousSiblings(self, name = None, attrs = { }, text = None, limit = None, **kwargs): '''Returns the siblings of this Tag that match the given criteria and appear before this Tag in the document.''' return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) fetchPreviousSiblings = findPreviousSiblings def findParent(self, name = None, attrs = { }, **kwargs): '''Returns the closest parent of this Tag that matches the given criteria.''' r = None l = self.findParents(name, attrs, 1) if l: r = l[0] return r def findParents(self, name = None, attrs = { }, limit = None, **kwargs): '''Returns the parents of this Tag that match the given criteria.''' return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) fetchParents = findParents def _findOne(self, method, name, attrs, text, **kwargs): r = None l = method(name, attrs, text, 1, **kwargs) if l: r = l[0] return r def _findAll(self, name, attrs, text, limit, generator, **kwargs): '''Iterates over a generator looking for things that match.''' if isinstance(name, SoupStrainer): strainer = name else: strainer = SoupStrainer(name, attrs, text, **kwargs) results = ResultSet(strainer) g = generator() while True: try: i = g.next() except StopIteration: break if i: found = strainer.search(i) if found: results.append(found) if limit and len(results) >= limit: break found return results def nextGenerator(self): i = self while i: i = i.next yield i def nextSiblingGenerator(self): i = self while i: i = i.nextSibling yield i def previousGenerator(self): i = self while i: i = i.previous yield i def previousSiblingGenerator(self): i = self while i: i = i.previousSibling yield i def parentGenerator(self): i = self while i: i = i.parent yield i def substituteEncoding(self, str, encoding = None): if not encoding: pass encoding = 'utf-8' return str.replace('%SOUP-ENCODING%', encoding) def toEncoding(self, s, encoding = None): '''Encodes an object to a string in some 
encoding, or to Unicode. .''' if isinstance(s, unicode): if encoding: s = s.encode(encoding) elif isinstance(s, str): if encoding: s = s.encode(encoding) else: s = unicode(s) elif encoding: s = self.toEncoding(str(s), encoding) else: s = unicode(s) return s class NavigableString(unicode, PageElement): def __new__(cls, value): """Create a new NavigableString. When unpickling a NavigableString, this method is called with the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ if isinstance(value, unicode): return unicode.__new__(cls, value) return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): return (unicode(self),) def __getattr__(self, attr): '''text.string gives you text. This is for backwards compatibility for Navigable*String, but for CData* it lets you get the string without the CData wrapper.''' if attr == 'string': return self raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) def encode(self, encoding = DEFAULT_OUTPUT_ENCODING): return self.decode().encode(encoding) def decodeGivenEventualEncoding(self, eventualEncoding): return self class CData(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<![CDATA[' + self + u']]>' class ProcessingInstruction(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): output = self if u'%SOUP-ENCODING%' in output: output = self.substituteEncoding(output, eventualEncoding) return u'<?' + output + u'?>' class Comment(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!--' + self + u'-->' class Declaration(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!' 
+ self + u'>' class Tag(PageElement): '''Represents a found HTML tag with its attributes and contents.''' def _invert(h): '''Cheap function to invert a hash.''' i = { } for k, v in h.items(): i[v] = k return i XML_ENTITIES_TO_SPECIAL_CHARS = { 'apos': "'", 'quot': '"', 'amp': '&', 'lt': '<', 'gt': '>' } XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) def _convertEntities(self, match): '''Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.''' x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) if x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] return u'&%s;' % x x in self.XML_ENTITIES_TO_SPECIAL_CHARS if len(x) > 0 and x[0] == '#': if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) return unichr(int(x[1:])) x[0] == '#' if self.escapeUnrecognizedEntities: return u'&%s;' % x return u'&%s;' % x def __init__(self, parser, name, attrs = None, parent = None, previous = None): '''Basic constructor.''' self.parserClass = parser.__class__ self.isSelfClosing = parser.isSelfClosingTag(name) self.name = name if attrs == None: attrs = [] self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False self.containsSubstitutions = False self.convertHTMLEntities = parser.convertHTMLEntities self.convertXMLEntities = parser.convertXMLEntities self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities def convert(kval): '''Converts HTML, XML and numeric entities in the attribute value.''' (k, val) = kval if val is None: return kval return (k, re.sub('&(#\\d+|#x[0-9a-fA-F]+|\\w+);', self._convertEntities, val)) self.attrs = map(convert, self.attrs) def get(self, key, default = None): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' 
if it doesn't have that attribute.""" return self._getAttrMap().get(key, default) def has_key(self, key): return self._getAttrMap().has_key(key) def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" return self._getAttrMap()[key] def __iter__(self): '''Iterating over a tag iterates over its contents.''' return iter(self.contents) def __len__(self): '''The length of a tag is the length of its list of contents.''' return len(self.contents) def __contains__(self, x): return x in self.contents def __nonzero__(self): '''A tag is non-None even if it has no contents.''' return True def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" self._getAttrMap() self.attrMap[key] = value found = False for i in range(0, len(self.attrs)): if self.attrs[i][0] == key: self.attrs[i] = (key, value) found = True continue if not found: self.attrs.append((key, value)) self._getAttrMap()[key] = value def __delitem__(self, key): """Deleting tag[key] deletes all 'key' attributes for the tag.""" for item in self.attrs: if item[0] == key: self.attrs.remove(item) self._getAttrMap() if self.attrMap.has_key(key): del self.attrMap[key] continue def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its findAll() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return apply(self.findAll, args, kwargs) def __getattr__(self, tag): if len(tag) > 3 and tag.rfind('Tag') == len(tag) - 3: return self.find(tag[:-3]) if tag.find('__') != 0: return self.find(tag) raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) def __eq__(self, other): '''Returns true iff this tag has the same name, the same attributes, and the same contents (recursively) as the given tag. NOTE: right now this will return false if two tags have the same attributes in a different order. 
Should this be fixed?''' if not hasattr(other, 'name') and not hasattr(other, 'attrs') and not hasattr(other, 'contents') and self.name != other.name and self.attrs != other.attrs or len(self) != len(other): return False for i in range(0, len(self.contents)): if self.contents[i] != other.contents[i]: return False return True def __ne__(self, other): '''Returns true iff this tag is not identical to the other tag, as defined in __eq__.''' return not (self == other) def __repr__(self, encoding = DEFAULT_OUTPUT_ENCODING): '''Renders this tag as a string.''' return self.decode(eventualEncoding = encoding) BARE_AMPERSAND_OR_BRACKET = re.compile('([<>]|' + '&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)' + ')') def _sub_entity(self, x): '''Used with a regular expression to substitute the appropriate XML entity for an XML special character.''' return '&' + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ';' def __unicode__(self): return self.decode() def __str__(self): return self.encode() def encode(self, encoding = DEFAULT_OUTPUT_ENCODING, prettyPrint = False, indentLevel = 0): return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) def decode(self, prettyPrint = False, indentLevel = 0, eventualEncoding = DEFAULT_OUTPUT_ENCODING): '''Returns a string or Unicode representation of this tag and its contents. 
To get Unicode, pass None for encoding.''' attrs = [] if self.attrs: for key, val in self.attrs: fmt = '%s="%s"' if isString(val): if self.containsSubstitutions and eventualEncoding is not None and '%SOUP-ENCODING%' in val: val = self.substituteEncoding(val, eventualEncoding) if '"' in val: fmt = "%s='%s'" if "'" in val: val = val.replace("'", '&squot;') val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) if val is None: decoded = key else: decoded = fmt % (key, val) attrs.append(decoded) close = '' closeTag = '' if self.isSelfClosing: close = ' /' else: closeTag = '</%s>' % self.name (indentTag, indentContents) = (0, 0) if prettyPrint: indentTag = indentLevel space = ' ' * (indentTag - 1) indentContents = indentTag + 1 contents = self.decodeContents(prettyPrint, indentContents, eventualEncoding) if self.hidden: s = contents else: s = [] attributeString = '' if attrs: attributeString = ' ' + ' '.join(attrs) if prettyPrint: s.append(space) s.append('<%s%s%s>' % (self.name, attributeString, close)) if prettyPrint: s.append('\n') s.append(contents) if prettyPrint and contents and contents[-1] != '\n': s.append('\n') if prettyPrint and closeTag: s.append(space) s.append(closeTag) if prettyPrint and closeTag and self.nextSibling: s.append('\n') s = ''.join(s) return s def decompose(self): '''Recursively destroys the contents of this tree.''' contents = [ i for i in self.contents ] for i in contents: if isinstance(i, Tag): i.decompose() continue [] i.extract() self.extract() def prettify(self, encoding = DEFAULT_OUTPUT_ENCODING): return self.encode(encoding, True) def encodeContents(self, encoding = DEFAULT_OUTPUT_ENCODING, prettyPrint = False, indentLevel = 0): return self.decodeContents(prettyPrint, indentLevel).encode(encoding) def decodeContents(self, prettyPrint = False, indentLevel = 0, eventualEncoding = DEFAULT_OUTPUT_ENCODING): '''Renders the contents of this tag as a string in the given encoding. 
If encoding is None, returns a Unicode string..''' s = [] for c in self: text = None if isinstance(c, NavigableString): text = c.decodeGivenEventualEncoding(eventualEncoding) elif isinstance(c, Tag): s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) if text and prettyPrint: text = text.strip() if text: if prettyPrint: s.append(' ' * (indentLevel - 1)) s.append(text) if prettyPrint: s.append('\n') prettyPrint return ''.join(s) def find(self, name = None, attrs = { }, recursive = True, text = None, **kwargs): '''Return only the first child of this Tag matching the given criteria.''' r = None l = self.findAll(name, attrs, recursive, text, 1, **kwargs) if l: r = l[0] return r findChild = find def findAll(self, name = None, attrs = { }, recursive = True, text = None, limit = None, **kwargs): """Extracts a list of Tag objects that match the given criteria. You can specify the name of the Tag and any attributes you want the Tag to have. The value of a key-value pair in the 'attrs' map can be a string, a list of strings, a regular expression object, or a callable that takes a string and returns whether or not the string matches for some custom definition of 'matches'. 
The same is true of the tag name.""" generator = self.recursiveChildGenerator if not recursive: generator = self.childGenerator return self._findAll(name, attrs, text, limit, generator, **kwargs) findChildren = findAll first = find fetch = findAll def fetchText(self, text = None, recursive = True, limit = None): return self.findAll(text = text, recursive = recursive, limit = limit) def firstText(self, text = None, recursive = True): return self.find(text = text, recursive = recursive) def renderContents(self, encoding = DEFAULT_OUTPUT_ENCODING, prettyPrint = False, indentLevel = 0): if encoding is None: return self.decodeContents(prettyPrint, indentLevel, encoding) return self.encodeContents(encoding, prettyPrint, indentLevel) def _getAttrMap(self): """Initializes a map representation of this tag's attributes, if not already initialized.""" if not getattr(self, 'attrMap'): self.attrMap = { } for key, value in self.attrs: self.attrMap[key] = value return self.attrMap def recursiveChildGenerator(self): if not len(self.contents): raise StopIteration len(self.contents) stopNode = self._lastRecursiveChild().next current = self.contents[0] while current is not stopNode: yield current current = current.next def childGenerator(self): if not len(self.contents): raise StopIteration len(self.contents) current = self.contents[0] while current: yield current current = current.nextSibling raise StopIteration class SoupStrainer: '''Encapsulates a number of ways of matching a markup element (tag or text).''' def __init__(self, name = None, attrs = { }, text = None, **kwargs): self.name = name if isString(attrs): kwargs['class'] = attrs attrs = None if kwargs: if attrs: attrs = attrs.copy() attrs.update(kwargs) else: attrs = kwargs self.attrs = attrs self.text = text def __str__(self): if self.text: return self.text return '%s|%s' % (self.name, self.attrs) def searchTag(self, markupName = None, markupAttrs = { }): found = None markup = None if isinstance(markupName, Tag): markup = 
markupName markupAttrs = markup if callable(self.name): pass callFunctionWithTagData = not isinstance(markupName, Tag) if not not (self.name) and callFunctionWithTagData: if (markup or self._matches(markup, self.name) or not markup) and self._matches(markupName, self.name): if callFunctionWithTagData: match = self.name(markupName, markupAttrs) else: match = True markupAttrMap = None for attr, matchAgainst in self.attrs.items(): if not markupAttrMap: if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs else: markupAttrMap = { } for k, v in markupAttrs: markupAttrMap[k] = v attrValue = markupAttrMap.get(attr) if not self._matches(attrValue, matchAgainst): match = False break continue if match: if markup: found = markup else: found = markupName return found def search(self, markup): found = None if isList(markup) and not isinstance(markup, Tag): for element in markup: if isinstance(element, NavigableString) and self.search(element): found = element break continue elif isinstance(markup, Tag): if not self.text: found = self.searchTag(markup) elif isinstance(markup, NavigableString) or isString(markup): if self._matches(markup, self.text): found = markup else: raise Exception, "I don't know how to match against a %s" % markup.__class__ return isString(markup) def _matches(self, markup, matchAgainst): result = False if matchAgainst == True and type(matchAgainst) == types.BooleanType: result = markup != None elif callable(matchAgainst): result = matchAgainst(markup) elif isinstance(markup, Tag): markup = markup.name if markup is not None and not isString(markup): markup = unicode(markup) if hasattr(matchAgainst, 'match'): if markup: pass result = matchAgainst.search(markup) elif isList(matchAgainst): if markup is not None or not isString(matchAgainst): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): result = markup.has_key(matchAgainst) elif matchAgainst and isString(markup): if isinstance(markup, unicode): matchAgainst = unicode(matchAgainst) 
else: matchAgainst = str(matchAgainst) if not result: result = matchAgainst == markup return result class ResultSet(list): '''A ResultSet is just a list that keeps track of the SoupStrainer that created it.''' def __init__(self, source): list.__init__([]) self.source = source def isList(l): '''Convenience method that works with all 2.x versions of Python to determine whether or not something is listlike.''' if not hasattr(l, '__iter__') or not isString(l): pass return type(l) in (types.ListType, types.TupleType) def isString(s): '''Convenience method that works with all 2.x versions of Python to determine whether or not something is stringlike.''' try: if not isinstance(s, unicode): pass return isinstance(s, basestring) except NameError: return isinstance(s, str) def buildTagMap(default, *args): '''Turns a list of maps, lists, or scalars into a single map. Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and NESTING_RESET_TAGS maps out of lists and partial maps.''' built = { } for portion in args: if hasattr(portion, 'items'): for k, v in portion.items(): built[k] = v if isList(portion) and not isString(portion): for k in portion: built[k] = default built[portion] = default return built class HTMLParserBuilder(HTMLParser): def __init__(self, soup): HTMLParser.__init__(self) self.soup = soup def handle_starttag(self, name, attrs): if name == 'meta': self.soup.extractCharsetFromMeta(attrs) else: self.soup.unknown_starttag(name, attrs) def handle_endtag(self, name): self.soup.unknown_endtag(name) def handle_data(self, content): self.soup.handle_data(content) def _toStringSubclass(self, text, subclass): '''Adds a certain piece of text to the tree as a NavigableString subclass.''' self.soup.endData() self.handle_data(text) self.soup.endData(subclass) def handle_pi(self, text): '''Handle a processing instruction as a ProcessingInstruction object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.''' if text[:3] == 'xml': text = 
u"xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): '''Handle comments as Comment objects.''' self._toStringSubclass(text, Comment) def handle_charref(self, ref): '''Handle character references as data.''' if self.soup.convertEntities: data = unichr(int(ref)) else: data = '%s;' % ref self.handle_data(data) def handle_entityref(self, ref): '''Handle entity references as data, possibly converting known HTML and/or XML entity references to the corresponding Unicode characters.''' data = None if self.soup.convertHTMLEntities: try: data = unichr(name2codepoint[ref]) except KeyError: pass except: None<EXCEPTION MATCH>KeyError None<EXCEPTION MATCH>KeyError if not data and self.soup.convertXMLEntities: data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) if not data and self.soup.convertHTMLEntities and not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): data = '&%s' % ref if not data: data = '&%s;' % ref self.handle_data(data) def handle_decl(self, data): '''Handle DOCTYPEs and the like as Declaration objects.''' self._toStringSubclass(data, Declaration) def parse_declaration(self, i): '''Treat a bogus SGML declaration as raw data. Treat a CDATA declaration as a CData object.''' j = None if self.rawdata[i:i + 9] == '<![CDATA[': k = self.rawdata.find(']]>', i) if k == -1: k = len(self.rawdata) data = self.rawdata[i + 9:k] j = k + 3 self._toStringSubclass(data, CData) else: try: j = HTMLParser.parse_declaration(self, i) except HTMLParseError: toHandle = self.rawdata[i:] self.handle_data(toHandle) j = i + len(toHandle) return j class BeautifulStoneSoup(Tag): '''This class contains the basic parser and search code. It defines a parser that knows nothing about tag behavior except for the following: You can\'t close a tag without closing all the tags it encloses. That is, "<foo><bar></foo>" actually means "<foo><bar></bar></foo>". 
[Another possible explanation is "<foo><bar /></foo>", but since this class defines no SELF_CLOSING_TAGS, it will never use that explanation.] This class is useful for parsing XML or made-up markup languages, or when BeautifulSoup makes an assumption counter to what you were expecting.''' SELF_CLOSING_TAGS = { } NESTABLE_TAGS = { } RESET_NESTING_TAGS = { } QUOTE_TAGS = { } PRESERVE_WHITESPACE_TAGS = [] MARKUP_MASSAGE = [ (re.compile('(<[^<>]*)/>'), (lambda x: x.group(1) + ' />')), (re.compile('<!\\s+([^<>]*)>'), (lambda x: '<!' + x.group(1) + '>'))] ROOT_TAG_NAME = u'[document]' HTML_ENTITIES = 'html' XML_ENTITIES = 'xml' XHTML_ENTITIES = 'xhtml' ALL_ENTITIES = XHTML_ENTITIES STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None } def __init__(self, markup = '', parseOnlyThese = None, fromEncoding = None, markupMassage = True, smartQuotesTo = XML_ENTITIES, convertEntities = None, selfClosingTags = None, isHTML = False, builder = HTMLParserBuilder): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser. HTMLParser will process most bad HTML, and the BeautifulSoup class has some tricks for dealing with some HTML that kills HTMLParser, but Beautiful Soup can nonetheless choke or lose data if your data uses self-closing tags or declarations incorrectly. By default, Beautiful Soup uses regexes to sanitize input, avoiding the vast majority of these problems. If the problems don't apply to you, pass in False for markupMassage, and you'll get better performance. The default parser massage techniques fix the two most common instances of invalid HTML that choke HTMLParser: <br/> (No space between name of closing tag and tag close) <! 
--Comment--> (Extraneous whitespace in declaration) You can pass in a custom list of (RE object, replace method) tuples to get Beautiful Soup to scrub your input the way you want.""" self.parseOnlyThese = parseOnlyThese self.fromEncoding = fromEncoding self.smartQuotesTo = smartQuotesTo self.convertEntities = convertEntities if self.convertEntities: self.smartQuotesTo = None if convertEntities == self.HTML_ENTITIES: self.convertXMLEntities = False self.convertHTMLEntities = True self.escapeUnrecognizedEntities = True elif convertEntities == self.XHTML_ENTITIES: self.convertXMLEntities = True self.convertHTMLEntities = True self.escapeUnrecognizedEntities = False elif convertEntities == self.XML_ENTITIES: self.convertXMLEntities = True self.convertHTMLEntities = False self.escapeUnrecognizedEntities = False else: self.convertXMLEntities = False self.convertHTMLEntities = False self.escapeUnrecognizedEntities = False self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) self.builder = builder(self) self.reset() if hasattr(markup, 'read'): markup = markup.read() self.markup = markup self.markupMassage = markupMassage try: self._feed(isHTML = isHTML) except StopParsing: pass self.markup = None self.builder = None def _feed(self, inDocumentEncoding = None, isHTML = False): markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit(markup, [ self.fromEncoding, inDocumentEncoding], smartQuotesTo = self.smartQuotesTo, isHTML = isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not isList(self.markupMassage): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) del self.markupMassage self.builder.reset() self.builder.feed(markup) self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: 
self.popTag() def isSelfClosingTag(self, name): '''Returns true iff the given string is the name of a self-closing tag according to this parser.''' if not self.SELF_CLOSING_TAGS.has_key(name): pass return self.instanceSelfClosingTags.has_key(name) def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 self.builder.reset() self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self) def popTag(self): tag = self.tagStack.pop() if len(self.currentTag.contents) == 1 and isinstance(self.currentTag.contents[0], NavigableString): self.currentTag.string = self.currentTag.contents[0] if self.tagStack: self.currentTag = self.tagStack[-1] return self.currentTag def pushTag(self, tag): if self.currentTag: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] def endData(self, containerClass = NavigableString): if self.currentData: currentData = u''.join(self.currentData) self.currentData = [] if self.parseOnlyThese and len(self.tagStack) <= 1: if not (self.parseOnlyThese.text) or not self.parseOnlyThese.search(currentData): return None o = containerClass(currentData) o.setup(self.currentTag, self.previous) self.previous = o self.currentTag.contents.append(o) def _popToTag(self, name, inclusivePop = True): '''Pops the tag stack up to and including the most recent instance of the given tag. 
If inclusivePop is false, pops the tag stack up to but *not* including the most recent instance of the given tag.''' if name == self.ROOT_TAG_NAME: return None numPops = 0 mostRecentTag = None for i in range(len(self.tagStack) - 1, 0, -1): if name == self.tagStack[i].name: numPops = len(self.tagStack) - i break continue name == self.ROOT_TAG_NAME if not inclusivePop: numPops = numPops - 1 for i in range(0, numPops): mostRecentTag = self.popTag() return mostRecentTag def _smartPop(self, name): """We need to pop up to the previous tag of this type, unless one of this tag's nesting reset triggers comes between this tag and the previous tag of this type, OR unless this tag is a generic nesting trigger and another generic nesting trigger comes between this tag and the previous tag of this type. Examples: <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. 
<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' <td><tr><td> *<td>* should pop to 'tr', not the first 'td' """ nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None isResetNesting = self.RESET_NESTING_TAGS.has_key(name) popTo = None inclusive = True for i in range(len(self.tagStack) - 1, 0, -1): p = self.tagStack[i] if (not p or p.name == name) and not isNestable: popTo = name break if (nestingResetTriggers != None or p.name in nestingResetTriggers or nestingResetTriggers == None) and isResetNesting and self.RESET_NESTING_TAGS.has_key(p.name): popTo = p.name inclusive = False break p = p.parent if popTo: self._popToTag(popTo, inclusive) def unknown_starttag(self, name, attrs, selfClosing = 0): if self.quoteStack: attrs = ''.join(map((lambda .0: (x, y) = .0' %s="%s"' % (x, y)), attrs)) self.handle_data('<%s%s>' % (name, attrs)) return None self.endData() if not self.isSelfClosingTag(name) and not selfClosing: self._smartPop(name) if self.parseOnlyThese and len(self.tagStack) <= 1: if self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs): return None tag = Tag(self, name, attrs, self.currentTag, self.previous) if self.previous: self.previous.next = tag self.previous = tag self.pushTag(tag) if selfClosing or self.isSelfClosingTag(name): self.popTag() if name in self.QUOTE_TAGS: self.quoteStack.append(name) self.literal = 1 return tag def unknown_endtag(self, name): if self.quoteStack and self.quoteStack[-1] != name: self.handle_data('</%s>' % name) return None self.endData() self._popToTag(name) if self.quoteStack and self.quoteStack[-1] == name: self.quoteStack.pop() self.literal = len(self.quoteStack) > 0 def handle_data(self, data): self.currentData.append(data) def extractCharsetFromMeta(self, attrs): self.unknown_starttag('meta', attrs) class BeautifulSoup(BeautifulStoneSoup): """This parser knows the following facts about HTML: * Some tags have no closing tag and should be interpreted 
as being closed as soon as they are encountered. * The text inside some tags (i.e. 'script') may contain tags which are not really part of the document and which should be parsed as text, not tags. If you want to parse the text as tags, you can always fetch it and parse it explicitly. * Tag nesting rules: Most tags can't be nested at all. For instance, the occurrence of a <p> tag should implicitly close the previous <p> tag. <p>Para1<p>Para2 should be transformed into: <p>Para1</p><p>Para2 Some tags can be nested arbitrarily. For instance, the occurrence of a <blockquote> tag should _not_ implicitly close the previous <blockquote> tag. Alice said: <blockquote>Bob said: <blockquote>Blah should NOT be transformed into: Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah Some tags can be nested, but the nesting is reset by the interposition of other tags. For instance, a <tr> tag should implicitly close the previous <tr> tag within the same <table>, but not close a <tr> tag in another table. <table><tr>Blah<tr>Blah should be transformed into: <table><tr>Blah</tr><tr>Blah but, <tr>Blah<table><tr>Blah should NOT be transformed into <tr>Blah<table></tr><tr>Blah Differing assumptions about tag nesting rules are a major source of problems with the BeautifulSoup class. 
If BeautifulSoup is not treating as nestable a tag your page author treats as nestable, try ICantBelieveItsBeautifulSoup, MinimalSoup, or BeautifulStoneSoup before writing your own subclass.""" def __init__(self, *args, **kwargs): if not kwargs.has_key('smartQuotesTo'): kwargs['smartQuotesTo'] = self.HTML_ENTITIES kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs) SELF_CLOSING_TAGS = buildTagMap(None, [ 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) PRESERVE_WHITESPACE_TAGS = set([ 'pre', 'textarea']) QUOTE_TAGS = { 'script': None, 'textarea': None } NESTABLE_INLINE_TAGS = [ 'span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center'] NESTABLE_BLOCK_TAGS = [ 'blockquote', 'div', 'fieldset', 'ins', 'del'] NESTABLE_LIST_TAGS = { 'ol': [], 'ul': [], 'li': [ 'ul', 'ol'], 'dl': [], 'dd': [ 'dl'], 'dt': [ 'dl'] } NESTABLE_TABLE_TAGS = { 'table': [], 'tr': [ 'table', 'tbody', 'tfoot', 'thead'], 'td': [ 'tr'], 'th': [ 'tr'], 'thead': [ 'table'], 'tbody': [ 'table'], 'tfoot': [ 'table'] } NON_NESTABLE_BLOCK_TAGS = [ 'address', 'form', 'p', 'pre'] RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', NON_NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) CHARSET_RE = re.compile('((^|;)\\s*charset=)([^;]*)', re.M) def extractCharsetFromMeta(self, attrs): '''Beautiful Soup can detect a charset included in a META tag, try to convert the document to that charset, and re-parse the document from the beginning.''' httpEquiv = None contentType = None contentTypeIndex = None tagNeedsEncodingSubstitution = False for i in range(0, len(attrs)): (key, value) = attrs[i] key = key.lower() if key == 'http-equiv': httpEquiv = value continue if key == 'content': contentType = value contentTypeIndex = i continue if httpEquiv and contentType: match = self.CHARSET_RE.search(contentType) if match: if 
self.declaredHTMLEncoding is not None or self.originalEncoding == self.fromEncoding: def rewrite(match): return match.group(1) + '%SOUP-ENCODING%' newAttr = self.CHARSET_RE.sub(rewrite, contentType) attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], newAttr) tagNeedsEncodingSubstitution = True else: newCharset = match.group(3) if newCharset and newCharset != self.originalEncoding: self.declaredHTMLEncoding = newCharset self._feed(self.declaredHTMLEncoding) raise StopParsing newCharset != self.originalEncoding tag = self.unknown_starttag('meta', attrs) if tag and tagNeedsEncodingSubstitution: tag.containsSubstitutions = True class StopParsing(Exception): pass class ICantBelieveItsBeautifulSoup(BeautifulSoup): '''The BeautifulSoup class is oriented towards skipping over common HTML errors like unclosed tags. However, sometimes it makes errors of its own. For instance, consider this fragment: <b>Foo<b>Bar</b></b> This is perfectly valid (if bizarre) HTML. However, the BeautifulSoup class will implicitly close the first b tag when it encounters the second \'b\'. It will think the author wrote "<b>Foo<b>Bar", and didn\'t close the first \'b\' tag, because there\'s no real-world reason to bold something that\'s already bold. When it encounters \'</b></b>\' it will close two more \'b\' tags, for a grand total of three tags closed instead of two. This can throw off the rest of your document structure. The same is true of a number of other tags, listed below. It\'s much more common for someone to forget to close a \'b\' tag than to actually use nested \'b\' tags, and the BeautifulSoup class handles the common case. 
This class handles the not-so-common case: where you can\'t believe someone wrote what they did, but it\'s valid HTML and BeautifulSoup screwed up by assuming it wouldn\'t be.''' I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = [ 'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big'] I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = [ 'noscript'] NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) class MinimalSoup(BeautifulSoup): '''The MinimalSoup class is for parsing HTML that contains pathologically bad markup. It makes no assumptions about tag nesting, but it does know which tags are self-closing, that <script> tags contain Javascript and should not be parsed, that META tags may contain encoding information, and so on. This also makes it better for subclassing than BeautifulStoneSoup or BeautifulSoup.''' RESET_NESTING_TAGS = buildTagMap('noscript') NESTABLE_TAGS = { } class BeautifulSOAP(BeautifulStoneSoup): '''This class will push a tag with only a single string child into the tag\'s parent as an attribute. The attribute\'s name is the tag name, and the value is the string child. An example should give the flavor of the change: <foo><bar>baz</bar></foo> => <foo bar="baz"><bar>baz</bar></foo> You can then access fooTag[\'bar\'] instead of fooTag.barTag.string. This is, of course, useful for scraping structures that tend to use subelements instead of attributes, such as SOAP messages. Note that it modifies its input, so don\'t print the modified version out. I\'m not sure how many people really want to use this class; let me know if you do. 
Mainly I like the name.''' def popTag(self): if len(self.tagStack) > 1: tag = self.tagStack[-1] parent = self.tagStack[-2] parent._getAttrMap() if isinstance(tag, Tag) and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString) and not parent.attrMap.has_key(tag.name): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) class RobustXMLParser(BeautifulStoneSoup): pass class RobustHTMLParser(BeautifulSoup): pass class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): pass class RobustInsanelyWackAssHTMLParser(MinimalSoup): pass class SimplifyingSOAPParser(BeautifulSOAP): pass try: import chardet except ImportError: chardet = None try: import cjkcodecs.aliases as cjkcodecs except ImportError: pass try: import iconv_codec except ImportError: pass class UnicodeDammit: '''A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is windows-1252, can replace MS smart quotes with their HTML or XML equivalents.''' CHARSET_ALIASES = { 'macintosh': 'mac-roman', 'x-sjis': 'shift-jis' } def __init__(self, markup, overrideEncodings = [], smartQuotesTo = 'xml', isHTML = False): self.declaredHTMLEncoding = None (self.markup, documentEncoding, sniffedEncoding) = self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): self.originalEncoding = None self.unicode = unicode(markup) return None u = None for proposedEncoding in overrideEncodings: u = self._convertFrom(proposedEncoding) if u: break continue isinstance(markup, unicode) if not u: for proposedEncoding in (documentEncoding, sniffedEncoding): u = self._convertFrom(proposedEncoding) if u: break continue if not u and chardet and not isinstance(self.markup, unicode): u = self._convertFrom(chardet.detect(self.markup)['encoding']) if not u: for proposed_encoding in ('utf-8', 'windows-1252'): u = self._convertFrom(proposed_encoding) if u: break continue 
self.unicode = u if not u: self.originalEncoding = None def _subMSChar(self, match): '''Changes a MS smart quote character to an XML or HTML entity.''' orig = match.group(1) sub = self.MS_CHARS.get(orig) if type(sub) == types.TupleType: if self.smartQuotesTo == 'xml': sub = ''.encode() + sub[1].encode() + ';'.encode() else: sub = '&'.encode() + sub[0].encode() + ';'.encode() else: sub = sub.encode() return sub def _convertFrom(self, proposed): proposed = self.find_codec(proposed) if not proposed or proposed in self.triedEncodings: return None self.triedEncodings.append(proposed) markup = self.markup if self.smartQuotesTo and proposed.lower() in ('windows-1252', 'iso-8859-1', 'iso-8859-2'): smart_quotes_re = '([\x80-\x9f])' smart_quotes_compiled = re.compile(smart_quotes_re) markup = smart_quotes_compiled.sub(self._subMSChar, markup) try: u = self._toUnicode(markup, proposed) self.markup = u self.originalEncoding = proposed except Exception: e = None return None return self.markup def _toUnicode(self, data, encoding): '''Given a string and its encoding, decodes the string into Unicode. 
%encoding is a string recognized by encodings.aliases''' if len(data) >= 4 and data[:2] == '\xfe\xff' and data[2:4] != '\x00\x00': encoding = 'utf-16be' data = data[2:] elif len(data) >= 4 and data[:2] == '\xff\xfe' and data[2:4] != '\x00\x00': encoding = 'utf-16le' data = data[2:] elif data[:3] == '\xef\xbb\xbf': encoding = 'utf-8' data = data[3:] elif data[:4] == '\x00\x00\xfe\xff': encoding = 'utf-32be' data = data[4:] elif data[:4] == '\xff\xfe\x00\x00': encoding = 'utf-32le' data = data[4:] newdata = unicode(data, encoding) return newdata def _detectEncoding(self, xml_data, isHTML = False): '''Given a document, tries to detect its XML encoding.''' xml_encoding = None sniffed_xml_encoding = None try: if xml_data[:4] == 'Lo\xa7\x94': xml_data = self._ebcdic_to_ascii(xml_data) elif xml_data[:4] == '\x00<\x00?': sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') elif len(xml_data) >= 4 and xml_data[:2] == '\xfe\xff' and xml_data[2:4] != '\x00\x00': sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') elif xml_data[:4] == '<\x00?\x00': sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') elif len(xml_data) >= 4 and xml_data[:2] == '\xff\xfe' and xml_data[2:4] != '\x00\x00': sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') elif xml_data[:4] == '\x00\x00\x00<': sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') elif xml_data[:4] == '<\x00\x00\x00': sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') elif xml_data[:4] == '\x00\x00\xfe\xff': sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') elif xml_data[:4] == '\xff\xfe\x00\x00': sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') elif xml_data[:3] == '\xef\xbb\xbf': sniffed_xml_encoding = 
'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') else: sniffed_xml_encoding = 'ascii' except: xml_encoding_match = None xml_encoding_re = '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode() xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) if not xml_encoding_match and isHTML: meta_re = '<\\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() regexp = re.compile(meta_re, re.I) xml_encoding_match = regexp.search(xml_data) if xml_encoding_match is not None: xml_encoding = xml_encoding_match.groups()[0].decode('ascii').lower() if isHTML: self.declaredHTMLEncoding = xml_encoding if sniffed_xml_encoding and xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16'): xml_encoding = sniffed_xml_encoding return (xml_data, xml_encoding, sniffed_xml_encoding) def find_codec(self, charset): if not self._codec(self.CHARSET_ALIASES.get(charset, charset)): if not charset or self._codec(charset.replace('-', '')): if not charset or self._codec(charset.replace('-', '_')): pass return charset def _codec(self, charset): if not charset: return charset codec = None try: codecs.lookup(charset) codec = charset except (LookupError, ValueError): charset charset except: charset return codec EBCDIC_TO_ASCII_MAP = None def _ebcdic_to_ascii(self, s): c = self.__class__ if not c.EBCDIC_TO_ASCII_MAP: emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 32, 160, 161, 162, 163, 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, 38, 169, 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, 94, 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, 37, 95, 62, 63, 186, 187, 188, 189, 190, 191, 192, 193, 194, 96, 58, 35, 64, 39, 61, 34, 195, 97, 98, 99, 
100, 101, 102, 103, 104, 105, 196, 197, 198, 199, 200, 201, 202, 106, 107, 108, 109, 110, 111, 112, 113, 114, 203, 204, 205, 206, 207, 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 123, 65, 66, 67, 68, 69, 70, 71, 72, 73, 232, 233, 234, 235, 236, 237, 125, 74, 75, 76, 77, 78, 79, 80, 81, 82, 238, 239, 240, 241, 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, 90, 244, 245, 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 250, 251, 252, 253, 254, 255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans(''.join(map(chr, range(256))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) MS_CHARS = { '\x80': ('euro', '20AC'), '\x81': ' ', '\x82': ('sbquo', '201A'), '\x83': ('fnof', '192'), '\x84': ('bdquo', '201E'), '\x85': ('hellip', '2026'), '\x86': ('dagger', '2020'), '\x87': ('Dagger', '2021'), '\x88': ('circ', '2C6'), '\x89': ('permil', '2030'), '\x8a': ('Scaron', '160'), '\x8b': ('lsaquo', '2039'), '\x8c': ('OElig', '152'), '\x8d': '?', '\x8e': ('#x17D', '17D'), '\x8f': '?', '\x90': '?', '\x91': ('lsquo', '2018'), '\x92': ('rsquo', '2019'), '\x93': ('ldquo', '201C'), '\x94': ('rdquo', '201D'), '\x95': ('bull', '2022'), '\x96': ('ndash', '2013'), '\x97': ('mdash', '2014'), '\x98': ('tilde', '2DC'), '\x99': ('trade', '2122'), '\x9a': ('scaron', '161'), '\x9b': ('rsaquo', '203A'), '\x9c': ('oelig', '153'), '\x9d': '?', '\x9e': ('#x17E', '17E'), '\x9f': ('Yuml', '') } if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) print soup.prettify()